#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 22 14:26:08 2019

@author: maximilianzeyda
"""

import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sn
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from scipy import stats
from statsmodels.compat import lzip
import statsmodels
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
from sklearn.tree import export_graphviz
import pydot
from subprocess import call
from IPython.display import Image
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC
from sklearn.inspection import permutation_importance
import eli5
from eli5.sklearn import PermutationImportance
from IPython.core.display import display, HTML
from sklearn import metrics
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer



# Raise pandas' console display limits (rows, columns, line width) so that the
# wide feature tables built below are printed in full instead of truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)           



class SensorData(object):
    """
    Sensor readings loaded from a semicolon-separated csv file, grouped by
    mood_data_id.

    Attributes:
        rawdata: dict mapping every non-empty mood_data_id (str) to the list of
            its rows; each row is the list of raw string fields of one line
        type_dict: dict mapping every header field to its column index
        data_dict: helper dictionary (not filled by this class)
    """

    def __init__ (self, filename):

        """
        Initialize a SensorData instance, which stores the sensor data points
        loaded from a given csv file specified by filename.

        Args:
            filename: name of the csv file (str)

        Raises:
            ValueError: if the header has no 'mood_data_id' column
        """

        self.rawdata = {}  # mood_data_id -> list of rows (each row a list of str)
        self.type_dict = {}  # header field -> column index
        self.data_dict = {}  # helper dictionary

        # One pass, and the context manager guarantees the handle is closed.
        # NOTE: lines are split as-is, so the last field of the header and of
        # every row still carries the trailing newline.
        with open(filename, 'r') as f:
            header = f.readline().split(';')
            id_col = header.index('mood_data_id')
            for line in f:
                items = line.split(';')
                mood_data_id = items[id_col]
                # rows without a mood_data_id are dropped
                if len(mood_data_id) > 0:
                    self.rawdata.setdefault(mood_data_id, []).append(items)

        for index, name in enumerate(header):
            self.type_dict[name] = index

    @staticmethod
    def _convert(data, typ, raw):
        """
        Convert one raw csv field to a number according to the sensor kind.

        The export was not properly converted from the US to the German version
        of Excel, so over-long fields are truncated and implausibly large
        heart-rate values are divided by 10. Fields of any other sensor kind
        are returned unchanged (as str).
        """
        value = raw

        if data == 'Heartrate':
            if len(value) > 6:
                value = value[:3]
            value = float(value)
            if typ == 'average':
                if value > 160:
                    value = value/10
            elif typ == 'stdDev':
                if value > 99:
                    value = value/10

        if 'Accelerometer' in data:
            if len(value) > 6:
                value = value[:3]
            value = float(value)

        if 'Microphone' in data:
            if len(value) > 6:
                value = value[:6]
            value = float(value)

        return value

    def get_data_list(self, mood_data_id, data, typ, toShow = False):
        """
        Get the list of all sensor data for the given mood_data_id, data (e.g. Heartrate) and type (e.g.average).

        Args:
            mood_data_id: The mood data id to be observed (str)
            data: the parameter to be observed, e.g. Heartrate (str); matched
                against the fourth column of every row
            typ: header name of the column to read, e.g. average, stdDev, min,
                max (str); an unknown name falls back to the first column
            toShow: True if the values should be plotted together with a
                second-degree polynomial fit
        Returns:
            a list with the data in the wanted type of the observed mood_data_id

        Raises:
            KeyError: if mood_data_id is unknown
        """
        rows = self.rawdata[mood_data_id]
        typ_nr = self.type_dict.get(typ, 0)

        result = [self._convert(data, typ, row[typ_nr])
                  for row in rows if row[3] == data]

        if toShow:
            x = list(range(1, len(result) + 1))
            p = np.poly1d(np.polyfit(x, result, 2))  # fit the model
            y_pred = [p(x_val) for x_val in x]
            fig, ax = plt.subplots(1,1)
            ax.plot(x, result, 'bo')
            ax.plot(x, y_pred, color = 'red')

        return result

    def calc_acc(self, mood_data_id):
        """
        Get the acceleration magnitude sqrt(x^2 + y^2 + z^2) for every reading.

        Args:
            mood_data_id: The mood data id to be observed (str)

        Returns:
            a list of floats, one per AccelerometerX reading

        Raises:
            IndexError: if there are fewer Y or Z readings than X readings
        """
        x = self.get_data_list(mood_data_id, 'AccelerometerX', 'average')
        y = self.get_data_list(mood_data_id, 'AccelerometerY', 'average')
        z = self.get_data_list(mood_data_id, 'AccelerometerZ', 'average')

        return [math.sqrt(x_val**2 + y[i]**2 + z[i]**2)
                for i, x_val in enumerate(x)]

    def _series(self, mood_data_id, data):
        """Return the per-reading values for data; 'Acceleration' is the magnitude of the three axes."""
        if data == 'Acceleration':
            return self.calc_acc(mood_data_id)
        return self.get_data_list(mood_data_id, data, 'average')

    def calc_avg(self, mood_data_id, data):
        """
        Get average of a specific data point over the whole meeting

        Args:
            mood_data_id: The mood data id to be observed (str)
            data: the parameter to be observed, e.g. Heartrate (str)

        Returns:
            a float value with the average value of e.g. Heartrate over the whole meeting

        Raises:
            ZeroDivisionError: if there is no reading of this kind
        """
        data_list = self._series(mood_data_id, data)
        return sum(data_list)/len(data_list)

    def calc_var(self, mood_data_id, data):

        """
        Get variance of a specific data point over the whole meeting

        Args:
            mood_data_id: The mood data id to be observed (str)
            data: the parameter to be observed, e.g. Heartrate (str)

        Returns:
            a float value with the sample variance (divisor n-1) of e.g. Heartrate over the whole meeting

        Raises:
            ZeroDivisionError: if there are fewer than two readings of this kind
        """
        data_list = self._series(mood_data_id, data)
        # the series is built once and reused for the mean
        mean = sum(data_list)/len(data_list)
        total = sum((value - mean)**2 for value in data_list)
        return total/(len(data_list)-1)

    def get_productivity(self, mood_data_id, cat=True):

        """
        Get productivity for a specific mood_data_id

        Args:
            mood_data_id: The mood data id to be observed (str)
            cat: True to return the category, False to return the raw rating

        Returns:
            if cat is False the int rating read from the 'answer_new_scale'
            column of the first row; otherwise its category:
            0 (rating <= 33), 1 (34 to 66) or 2 (above 66)
        """

        typ_nr = self.type_dict['answer_new_scale']
        prod = int(self.rawdata[mood_data_id][0][typ_nr])

        if not cat:
            return prod
        if prod <= 33:
            return 0
        if prod <= 66:
            return 1
        return 2
    
    
class ControlData(object):
    """
    Control-variable answers (gender, years in company, employee level) loaded
    from a semicolon-separated csv file, grouped by mood_data_id.

    Attributes:
        rawdata: dict mapping every mood_data_id that also occurs in the sensor
            data to the list of its rows (each row a list of str fields)
        meeting: Meeting instance built from the sensor data file
    """

    def __init__(self, filename, sensorfilename):

        """
        Initialize a ControlData instance, which stores the control data points
        loaded from a given csv file specified by filename.

        Args:
            filename: name of the control data csv file (str)
            sensorfilename: name of the sensor data csv file (str)

        Raises:
            ValueError: if the header has no 'mood_data_id' column
        """

        self.rawdata = {}
        self.meeting = Meeting(sensorfilename)
        # Meeting has already parsed the sensor file; reuse its ids instead of
        # parsing the same file a second time.
        sensor_ids = self.meeting.sensordata.rawdata

        # One pass, and the context manager guarantees the handle is closed.
        with open(filename, 'r') as f:
            header = f.readline().split(';')
            id_col = header.index('mood_data_id')
            for line in f:
                items = line.split(';')
                mood_data_id = items[id_col]
                # keep only observations for which sensor data exists
                if mood_data_id in sensor_ids:
                    self.rawdata.setdefault(mood_data_id, []).append(items)

    def _answer(self, mood_data_id, question):
        """
        Return the answer (int, fourth column) of the last row whose third
        column equals question, or None if there is no such row.
        """
        answer = None
        for row in self.rawdata[mood_data_id]:
            if row[2] == question:
                answer = int(row[3])
        return answer

    def _meeting_members(self, mood_data_id):
        """
        Return the list of mood_data_ids of the meeting mood_data_id belongs to.

        Raises:
            KeyError: if mood_data_id is not part of any meeting
        """
        for members in self.meeting.get_meeting_dict().values():
            if mood_data_id in members:
                return members
        raise KeyError('mood_data_id %r is not part of any meeting' % (mood_data_id,))

    def _ratio_in_meeting(self, mood_data_id, getter, label):
        """Return the share of meeting members for which getter(member, False) equals label."""
        members = self._meeting_members(mood_data_id)
        count = sum(1 for member in members if getter(member, False) == label)
        return count/len(members)

    def get_gender(self, mood_data_id, num=True):

        """
        Get gender for a specific mood_data_id (answer to question 28)

        Args:
            mood_data_id: The mood data id to be observed (str)
            num: True if gender should be returned on a number scale, False otherwise

        Returns:
            if num is True: 1 for male, 0 for female, None if no answer is available;
            otherwise 'male', 'female' or 'no gender data available'

        Raises:
            KeyError: if there is no control data for mood_data_id
        """

        answer = self._answer(mood_data_id, '28')

        # Answer 2 was actually "undefined", but only one observation fell under
        # this category: the participant pressed the option by mistake and
        # manually corrected the gender to female, so it is treated as female.
        if not num:
            labels = {0: 'male', 1: 'female', 2: 'female'}
            return labels.get(answer, 'no gender data available')

        # flipped relative to the raw answer so that 1 = male and 0 = female
        return {0: 1, 1: 0, 2: 0}.get(answer)

    def get_female_ratio_in_meeting(self, mood_data_id):
        """
        Gets the female ratio of the meeting a given mood_data_id belongs to

        Args:
            mood_data_id: The mood data id to be observed (str)

        Returns:
            The female ratio compared to all participants in the meeting

        Raises:
            KeyError: if mood_data_id is not part of any meeting
        """
        return self._ratio_in_meeting(mood_data_id, self.get_gender, 'female')

    def get_years_in_company(self, mood_data_id, num=True):

        """
        Gets the years spent with the company for a given mood_data_id
        (answer to question 29)

        Args:
            mood_data_id: The mood data id to be observed (str)
            num: True if the result should be returned on a number scale, False otherwise

        Returns:
            if num is True: 0 for less than 5 years, 1 for more than 5 years,
            None if no answer is available; otherwise 'less than 5 years',
            'more than 5 years' or 'no employment data available'

        Raises:
            KeyError: if there is no control data for mood_data_id
        """

        answer = self._answer(mood_data_id, '29')

        # answers 1 and 2 are both reported as "more than 5 years"
        if not num:
            labels = {0: 'less than 5 years', 1: 'more than 5 years', 2: 'more than 5 years'}
            return labels.get(answer, 'no employment data available')

        return {0: 0, 1: 1, 2: 1}.get(answer)

    def get_senior_ratio_in_meeting(self, mood_data_id):
        """
        Gets the ratio of senior employees (more than 5 years in the company)
        of the meeting a given mood_data_id belongs to

        Args:
            mood_data_id: The mood data id to be observed (str)

        Returns:
            The senior ratio compared to all participants in the meeting

        Raises:
            KeyError: if mood_data_id is not part of any meeting
        """
        return self._ratio_in_meeting(mood_data_id, self.get_years_in_company,
                                      'more than 5 years')

    def get_employee_level(self, mood_data_id, num=True):

        """
        Gets employee level for a given mood_data_id (answer to question 30)

        Args:
            mood_data_id: The mood data id to be observed (str)
            num: True if the level should be returned on a number scale, False otherwise

        Returns:
            if num is True: 1 for a managing position, 0 otherwise, None if no
            answer is available; otherwise 'in managing position',
            'not in managing position' or 'no employee level data available'

        Raises:
            KeyError: if there is no control data for mood_data_id
        """
        answer = self._answer(mood_data_id, '30')

        if not num:
            labels = {0: 'in managing position', 1: 'not in managing position'}
            return labels.get(answer, 'no employee level data available')

        # flipped relative to the raw answer so that the dummy is 1 for managers
        return {0: 1, 1: 0}.get(answer)

    def get_manager_ratio_in_meeting(self, mood_data_id):

        """
        Gets the ratio of employees in a managing position of the meeting a
        given mood_data_id belongs to

        Args:
            mood_data_id: The mood data id to be observed (str)

        Returns:
            The manager ratio compared to all participants in the meeting

        Raises:
            KeyError: if mood_data_id is not part of any meeting
        """
        return self._ratio_in_meeting(mood_data_id, self.get_employee_level,
                                      'in managing position')

    def get_meeting_participants(self, mood_data_id):

        """
        Gets the number of participants of the meeting a given mood_data_id
        belongs to

        Args:
            mood_data_id: The mood data id to be observed (str)

        Returns:
            The number of participants in that meeting

        Raises:
            KeyError: if mood_data_id is not part of any meeting
        """
        return len(self._meeting_members(mood_data_id))
    

class Meeting(object):
    """
    Groups the observations (mood_data_ids) of a sensor data file by meeting.

    Attributes:
        sensordata: SensorData instance loaded from the sensor data csv file
    """

    def __init__(self, sensordata):

        """
        Initialize a Meeting instance from a sensor data csv file.

        Args:
            sensordata: name of the sensor data csv file (str)

        """
        self.sensordata = SensorData(sensordata)

    def get_meeting_dict(self):

        """
        Gets a dictionary with meeting_nos and the corresponding mood_data_ids

        Args:
            None

        Returns:
            A dictionary organized by meeting no (int) with the corresponding
            mood_data_ids stored in a list

        Raises:
            IndexError: if an observation has no Heartrate reading
        """

        result = {}
        for mood_data_id in self.sensordata.rawdata:
            # The meeting number is read from the first Heartrate row; any
            # other sensor (e.g. AccelerometerY) would work as well.
            readings = self.sensordata.get_data_list(
                mood_data_id, 'Heartrate', 'meeting_no', toShow = False)
            # group in a single pass instead of rescanning all ids per id
            result.setdefault(int(readings[0]), []).append(mood_data_id)

        return result

    def get_meeting_no(self, mood_data_id):

        """
        Gets the meeting number of a given mood_data_id

        Args:
            mood_data_id: The mood data id to be observed (str)

        Returns:
            The meeting number (int), or 0 if the id belongs to no meeting
        """

        # every id belongs to exactly one meeting, so the first hit is the only one
        for meeting_no, members in self.get_meeting_dict().items():
            if mood_data_id in members:
                return meeting_no

        return 0



def create_pandas_table(sensordatafile, controldatafile, imputed=False):

    """
        Creates a pandas table comprising all observations with their
        corresponding sensor data but without productivity rating

        Args:
            sensordatafile: csv file with sensor data inputs (heart rate, arm acceleration, speech intensity)
            controldatfile: csv file with information about control variables
            imputed: if True, missing values are filled in with an
                IterativeImputer, the control dummies are rounded to whole
                numbers and meetings with at most one participant are dropped;
                if False (default) the raw table is returned

        Returns:
            pandas table indexed by mood_data_id with the columns bpm_avg,
            bpm_var, acc_avg, acc_var, mic_avg, mic_var, gender, senior,
            manager, participants and meeting_no
    """

    # ControlData already holds a Meeting (and through it the parsed sensor
    # data), so reuse these objects instead of parsing the sensor file again.
    controldata = ControlData(controldatafile, sensordatafile)
    meeting = controldata.meeting
    sensordata = meeting.sensordata

    mood_data_id = list(sensordata.rawdata.keys())

    # Build the meeting grouping once instead of once per observation.
    meeting_dict = meeting.get_meeting_dict()
    meeting_of = {}
    for number, members in meeting_dict.items():
        for member in members:
            meeting_of[member] = number

    data = {
        'bpm_avg': [sensordata.calc_avg(ident, 'Heartrate') for ident in mood_data_id],
        'bpm_var': [sensordata.calc_var(ident, 'Heartrate') for ident in mood_data_id],
        'acc_avg': [sensordata.calc_avg(ident, 'Acceleration') for ident in mood_data_id],
        'acc_var': [sensordata.calc_var(ident, 'Acceleration') for ident in mood_data_id],
        'mic_avg': [sensordata.calc_avg(ident, 'Microphone') for ident in mood_data_id],
        'mic_var': [sensordata.calc_var(ident, 'Microphone') for ident in mood_data_id],
        'gender': [controldata.get_gender(ident, True) for ident in mood_data_id],
        'senior': [controldata.get_years_in_company(ident, True) for ident in mood_data_id],
        'manager': [controldata.get_employee_level(ident, True) for ident in mood_data_id],
        'participants': [len(meeting_dict[meeting_of[ident]]) for ident in mood_data_id],
        'meeting_no': [meeting_of[ident] for ident in mood_data_id],
    }

    df = pd.DataFrame(data, index = mood_data_id)

    if not imputed:
        return df

    imputer = IterativeImputer(random_state=42)
    df_imputed = pd.DataFrame(imputer.fit_transform(df),
                              index = mood_data_id, columns=df.columns)

    # imputed dummies are real-valued; map them back to non-negative whole numbers
    for column in ('gender', 'senior', 'manager'):
        df_imputed[column] = abs(df_imputed[column].round())

    # drop observations from meetings with at most one participant
    df_imputed = df_imputed.drop(df_imputed[df_imputed.participants <= 1].index)

    return df_imputed

def create_pandas_table_with_prod(sensordatafile, controldatafile, scaled=False):

    """
        Creates a pandas table comprising all observations with their
        corresponding sensor data and with productivity rating -> this is further used in R

        Args:
            sensordatafile: csv file with sensor data inputs (heart rate, arm acceleration, speech intensity)
            controldatfile: csv file with information about control variables (gender, senior, manager)
            scaled: performs z transformation (=standardization) with non-categorial variables if True, False otherwise

        Returns:
            pandas table with sensor data, control data, and productivity data
            (columns 'prod_scale' with the raw rating and 'prod_cat' with the
            category 0, 1 or 2)
    """

    sensordata = SensorData(sensordatafile)
    df = create_pandas_table(sensordatafile, controldatafile)

    ids = df.index.values.tolist()

    # The productivity columns share the table's index, so they can be
    # attached directly.
    table = df.copy()
    table['prod_scale'] = [sensordata.get_productivity(item, False) for item in ids]
    table['prod_cat'] = [sensordata.get_productivity(item, True) for item in ids]

    if scaled: #scaled performs z transformation

        # only the sensor features are standardized; control variables,
        # meeting columns and the productivity targets stay untouched
        sensor_part = table.loc[:, 'bpm_avg':'mic_var']
        other_part = table.loc[:, 'gender':'prod_cat']

        scaler = preprocessing.StandardScaler()
        scaled_part = pd.DataFrame(scaler.fit_transform(sensor_part),
                                   columns=sensor_part.columns,
                                   index=table.index)

        table = pd.concat([scaled_part, other_part], axis=1)

    return table

    
def rf_classifier(sensordatafile, controldatafile, tuned = True):

    """
        Generates a random forest classifier with or without tuned parameters.
        Additionally, creates features (body signals) and labels (prod values) arrays as
        well as a list with the feature names (e.g., bpm_avg, bpm_var, acc_avg, etc.)

        The classifier is returned unfitted. Rows with missing values are
        dropped before the arrays are built.

        Args:
            sensordatafile: csv file with sensor data inputs (heart rate, arm acceleration, speech intensity)
            controldatfile: csv file with information about control variables (gender, senior, manager)
            tuned: specifies if the random forest classifier should be generated with tuned hyperparameters

        Returns:
            a tuple including the rf classifier, the features array, the labels array, and the feature list
    """

    table = create_pandas_table_with_prod(sensordatafile, controldatafile, scaled = True)
    table = table.dropna()

    for column in ("gender", "senior", "manager", "prod_cat"):
        table[column] = table[column].astype('category')

    labels = np.array(table['prod_cat']) # Labels are the values we want to predict

    # Remove the targets and the meeting identifier from the features
    table = table.drop('prod_scale', axis = 1)
    table = table.drop('prod_cat', axis = 1)
    table = table.drop('meeting_no', axis = 1)

    feature_list = list(table.columns)
    features = np.array(table) # Convert to numpy array

    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself
    # is no longer accepted by current scikit-learn releases.
    if tuned:
        rf = RandomForestClassifier(n_estimators = 1400, max_depth=100, max_features='sqrt', random_state = 42) #tuned parameters: process see below
    else:
        rf = RandomForestClassifier(n_estimators = 100, max_depth=None, max_features='sqrt', random_state = 42) #classifier with default parameters

    #---------------- HYPERPARAMETER TUNING (kept for reference, not executed) --------------------
    #
    # n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # max_features = ['sqrt', 'log2']
    # max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
    # max_depth.append(None)
    # random_grid = {
    #  'n_estimators': n_estimators,
    #  'max_features': max_features,
    #  'max_depth': max_depth
    #  }
    # rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    # rf_random.fit(features, labels)
    # print(rf_random.best_params_)
    #
    #---------------- RESULT OF HYPERP. TUNING: n_estimators: 1400, max_features: auto (= sqrt), max_depth: 100

    return rf, features, labels, feature_list

def gini_importance(sensordatafile, controldatafile, tuned = True):

    """
        Prints the Gini importance (GI) values for each estimator (class)

        Args:
            sensordatafile: csv file with sensor data inputs (heart rate, arm acceleration, speech intensity)
            controldatfile: csv file with information about control variables (gender, senior, manager)
            tuned: specifies if the random forest classifier should be generated with tuned hyperparameters

        Returns:
            None, but prints the feature importances (highest first) for each of the k = 3 splits
    """

    # Build the classifier and the data once; every rf_classifier call
    # re-reads the csv files and rebuilds the whole feature table.
    rf, features, labels, feature_list = rf_classifier(sensordatafile, controldatafile, tuned)

    output = cross_validate(rf, features, labels, cv=3, scoring = 'accuracy', return_estimator =True)
    for idx, estimator in enumerate(output['estimator']):
        print("Features sorted by their score for estimator {}:".format(idx))
        feature_importances = pd.DataFrame(estimator.feature_importances_,
                                        index = feature_list,
                                        columns=['importance'])
        # sort so that the output matches the printed header
        feature_importances = feature_importances.sort_values('importance', ascending=False)
        print(feature_importances)

    
def permutation_accuracy_importance(sensordatafile, controldatafile, tuned = True):

    """
        Prints the average permutation accuracy importance (PAI) values over k = 3 splits

        Args:
            sensordatafile: csv file with sensor data inputs (heart rate, arm acceleration, speech intensity)
            controldatfile: csv file with information about control variables (gender, senior, manager)
            tuned: specifies if the random forest classifier should be generated with tuned hyperparameters

        Returns:
            None, but prints the average PAI for the k = 3 splits, Note: prints a HTML code,
            as IDE Spyder cannot visualize  HTML code, it has to be inserted into a html visualizer,
            e.g., https://codebeautify.org/htmlviewer/
    """

    # Build the classifier and the data once; every rf_classifier call
    # re-reads the csv files and rebuilds the whole feature table.
    rf, features, labels, feature_list = rf_classifier(sensordatafile, controldatafile, tuned)

    perm = PermutationImportance(rf, random_state=42, cv=3).fit(features, labels)
    display(eli5.show_weights(perm, feature_names = feature_list).data)


    
def conf_matrix(sensordatafile, controldatafile, tuned = True):

    """
        Prints the confusion matrix for the random forest classification,
        based on 3-fold cross-validated predictions

        Args:
            sensordatafile: csv file with sensor data inputs (heart rate, arm acceleration, speech intensity)
            controldatfile: csv file with information about control variables (gender, senior, manager)
            tuned: specifies if the random forest classifier should be generated with tuned hyperparameters

        Returns:
            None, but prints the confusion matrix
    """

    # Build the classifier and the data once; every rf_classifier call
    # re-reads the csv files and rebuilds the whole feature table.
    rf, features, labels, _ = rf_classifier(sensordatafile, controldatafile, tuned)

    labels_pred = cross_val_predict(rf, features, labels, cv = 3)
    print (confusion_matrix(labels, labels_pred))
    
    
def classifier_performance(sensordatafile, controldatafile, tuned = True):

    """
        Prints the performance values precision, recall, f-score, and accuracy,
        based on 3-fold cross-validated predictions

        Args:
            sensordatafile: csv file with sensor data inputs (heart rate, arm acceleration, speech intensity)
            controldatfile: csv file with information about control variables (gender, senior, manager)
            tuned: specifies if the random forest classifier should be generated with tuned hyperparameters

        Returns:
            None, but prints the performance metrics
    """

    # Build the classifier and the data once; every rf_classifier call
    # re-reads the csv files and rebuilds the whole feature table.
    rf, features, labels, _ = rf_classifier(sensordatafile, controldatafile, tuned)

    labels_pred = cross_val_predict(rf, features, labels, cv = 3)
    print(metrics.classification_report(labels, labels_pred, digits=3))
    print(metrics.precision_recall_fscore_support(labels, labels_pred, average = 'micro'))
    print(metrics.accuracy_score(labels, labels_pred))
    
    
    

#---------------------RUN--------------------------- Uncomment to run, remove #



    
# Input files of the analysis; kept at module level so they stay importable.
sensordatafile = '2.csv'
controldatafile= 'generic_sample_answers.csv'

# Only run the analysis when the file is executed as a script, so that
# importing the module no longer reads the csv files and trains a model.
if __name__ == '__main__':
    #dfp=create_pandas_table_with_prod(sensordatafile, controldatafile, scaled = False)
    #df=create_pandas_table(sensordatafile, controldatafile)
    #print (df)
    #print(dfp)
    #df.to_csv('2020_03_23_notscaled.csv', index = True)
    #dfp.to_csv('2022_04_06_notscaled.csv', index = True)
    #gini_importance(sensordatafile, controldatafile, tuned = True)
    permutation_accuracy_importance(sensordatafile, controldatafile, tuned = True)
    #conf_matrix(sensordatafile, controldatafile, tuned = True)
    #classifier_performance(sensordatafile, controldatafile, tuned = False)
    #rf = rf_classifier(sensordatafile,controldatafile)




 
      